import csv
import sqlite3
import sys
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import re
import plotly.express as px
import numpy as np
from collections import Counter
import plotly.graph_objects as go
# Raise csv's field-size cap to the platform maximum — some clickstream
# fields overflow the default limit and abort the read otherwise.
csv.field_size_limit(sys.maxsize)

# Latest dump available at the time of analysis (December 2023), as instructed.
data_file = 'data/clickstream-enwiki-2023-12.tsv'
def extract_yearmonth(file_path):
    """Extract the compact year-month tag from a clickstream dump file name.

    Generalized to accept any wiki edition (enwiki, dewiki, ...), not just
    enwiki, so the same loader works for other languages' dumps.

    Args:
        file_path: Path such as 'data/clickstream-enwiki-2023-12.tsv'.

    Returns:
        The year and month without the dash, e.g. '202312'.

    Raises:
        ValueError: If the file name does not match the expected pattern.
    """
    # Capture the 4-digit year and 2-digit month from the file name.
    match = re.search(r'clickstream-\w+-(\d{4}-\d{2})\.tsv', file_path)
    if match:
        return match.group(1).replace('-', '')  # '2023-12' -> '202312'
    raise ValueError("Could not extract yearmonth from file name")
# Derive the yearmonth tag (e.g. '202312') from the data file name.
yearmonth = extract_yearmonth(data_file)

# Connect to the SQLite database (file is created if it does not exist).
conn = sqlite3.connect('clickstream_wiki.db')
cursor = conn.cursor()

# Start from a clean slate so reruns do not duplicate rows.
cursor.execute('DROP TABLE IF EXISTS clickstream_wiki')
cursor.execute('''
    CREATE TABLE IF NOT EXISTS clickstream_wiki(
        prev TEXT,
        curr TEXT,
        type TEXT,
        n INTEGER,
        yearmonth TEXT
    )
''')
conn.commit()

INSERT_SQL = (
    'INSERT INTO clickstream_wiki(prev, curr, type, n, yearmonth) '
    'VALUES (?, ?, ?, ?, ?)'
)
BATCH_SIZE = 100_000  # rows per executemany call

# Stream the tab-separated file and insert in batches, so we never hold
# all ~33M parsed rows in memory at once (the original accumulated the
# entire file into one list before a single executemany).
batch = []
with open(data_file, 'r', encoding='utf-8') as f:
    reader = csv.reader(f, delimiter='\t')
    for row in reader:
        # Keep only well-formed rows: exactly 4 fields, numeric count.
        if len(row) == 4 and row[3].isdigit():
            prev, curr, link_type, n = row
            batch.append((prev, curr, link_type, int(n), yearmonth))
            if len(batch) >= BATCH_SIZE:
                cursor.executemany(INSERT_SQL, batch)
                batch.clear()
    if batch:  # flush the final partial batch
        cursor.executemany(INSERT_SQL, batch)
conn.commit()

# Sanity check: report how many rows were loaded.
cursor.execute('SELECT COUNT(*) FROM clickstream_wiki')
total_rows = cursor.fetchone()[0]  # single-row, single-column result
print(f'Total rows: {total_rows}')
conn.close()
Total rows: 32927441
# Re-open the database and stream the whole table back into pandas.
conn = sqlite3.connect('clickstream_wiki.db')
chunk_size = 100000  # rows fetched per round trip
reader = pd.read_sql_query('SELECT * FROM clickstream_wiki', conn, chunksize=chunk_size)
chunks = [chunk for chunk in reader]
conn.close()
# Stitch the chunks into a single DataFrame with a fresh 0..N-1 index.
df = pd.concat(chunks, ignore_index=True)
Column descriptions:

- prev: the title of the Wikipedia article the user visited before the current one, or a special value indicating the source type (e.g. 'other-search' means an external search engine such as Google, outside Wikipedia).
- curr: the title of the current Wikipedia article the user visited.
- type: the type of referrer source. Possible values include: link — the user followed an internal link from another Wikipedia article; external — the user came from an external site (e.g., a search engine); other — other types of referrers.
- n: the number of occurrences of this (prev, curr, type) combination during the month.

df.head()
| prev | curr | type | n | yearmonth | |
|---|---|---|---|---|---|
| 0 | Wethersfield,_Essex | The_Hundred_Parishes | link | 23 | 202312 |
| 1 | History_of_Paris | History_of_Paris_(1946–2000) | link | 16 | 202312 |
| 2 | other-search | The_Hundred_Parishes | external | 23 | 202312 |
| 3 | Paris | History_of_Paris_(1946–2000) | link | 18 | 202312 |
| 4 | other-search | Christian_McNeish | external | 11 | 202312 |
# Schema and memory overview: ~32.9M rows, 5 columns, ~1.2 GB in memory.
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 32927441 entries, 0 to 32927440 Data columns (total 5 columns): # Column Dtype --- ------ ----- 0 prev object 1 curr object 2 type object 3 n int64 4 yearmonth object dtypes: int64(1), object(4) memory usage: 1.2+ GB
# Summary statistics for the only numeric column, n (click counts).
df.describe()
| n | |
|---|---|
| count | 3.292744e+07 |
| mean | 2.200678e+02 |
| std | 2.188026e+04 |
| min | 1.000000e+01 |
| 25% | 1.500000e+01 |
| 50% | 2.700000e+01 |
| 75% | 7.100000e+01 |
| max | 1.214930e+08 |
# Remove exact duplicate rows; the describe() below shows the row count
# is unchanged, so the dump contained no duplicates to begin with.
df = df.drop_duplicates()
df.describe()
| n | |
|---|---|
| count | 3.292744e+07 |
| mean | 2.200678e+02 |
| std | 2.188026e+04 |
| min | 1.000000e+01 |
| 25% | 1.500000e+01 |
| 50% | 2.700000e+01 |
| 75% | 7.100000e+01 |
| max | 1.214930e+08 |
# Draw a uniform random sample of 3,292,744 rows (~10% of the table)
# inside SQLite, so sampling does not require loading the full table.
conn = sqlite3.connect('clickstream_wiki.db')
sample_size = 3292744
sample_query = '''
SELECT *
FROM clickstream_wiki
WHERE rowid IN (
    SELECT rowid
    FROM clickstream_wiki
    ORDER BY RANDOM()
    LIMIT ?
)
'''
df_sample = pd.read_sql_query(sample_query, conn, params=(sample_size,))
conn.close()
df_sample.shape
(3292744, 5)
# Summary stats of n for the 10% sample — quartiles (15/27/71) match the
# full data, so the sample looks representative.
df_sample.describe()
| n | |
|---|---|
| count | 3.292744e+06 |
| mean | 2.138819e+02 |
| std | 3.171326e+03 |
| min | 1.000000e+01 |
| 25% | 1.500000e+01 |
| 50% | 2.700000e+01 |
| 75% | 7.100000e+01 |
| max | 1.905973e+06 |
# Sample schema check: 3,292,744 rows with the same 5 columns as the full data.
df_sample.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 3292744 entries, 0 to 3292743 Data columns (total 5 columns): # Column Dtype --- ------ ----- 0 prev object 1 curr object 2 type object 3 n int64 4 yearmonth object dtypes: int64(1), object(4) memory usage: 125.6+ MB
# Peek at the first few sampled rows.
df_sample.head()
| prev | curr | type | n | yearmonth | |
|---|---|---|---|---|---|
| 0 | Wethersfield,_Essex | The_Hundred_Parishes | link | 23 | 202312 |
| 1 | other-search | History_of_Paris_(1946–2000) | external | 304 | 202312 |
| 2 | other-empty | Sparse_matrix_converter | external | 21 | 202312 |
| 3 | Yingluck_Shinawatra | Chalerm_Yubamrung | link | 17 | 202312 |
| 4 | other-empty | Chalerm_Yubamrung | external | 18 | 202312 |
# Null check — the clickstream dump has no missing fields in any column.
df_sample.isna().sum()
prev 0 curr 0 type 0 n 0 yearmonth 0 dtype: int64
# Remove exact duplicate rows (none expected, but cheap insurance).
df_sample = df_sample.drop_duplicates()
# Enforce the intended dtypes: n numeric, article titles as strings.
df_sample['n'] = pd.to_numeric(df_sample['n'], errors='coerce')
for title_col in ('prev', 'curr'):
    df_sample[title_col] = df_sample[title_col].astype(str)
# Self-loops (prev == curr) carry no navigation signal — drop them.
keep = df_sample['prev'].ne(df_sample['curr'])
df_sample = df_sample[keep]
# Re-check stats after cleaning — count is unchanged, so the sample had
# no duplicates or self-loops to remove.
df_sample.describe()
| n | |
|---|---|
| count | 3.292744e+06 |
| mean | 2.138819e+02 |
| std | 3.171326e+03 |
| min | 1.000000e+01 |
| 25% | 1.500000e+01 |
| 50% | 2.700000e+01 |
| 75% | 7.100000e+01 |
| max | 1.905973e+06 |
# Distribution of click counts at deciles plus the upper tail — heavily
# right-skewed: the 95th percentile is ~507 while the max is ~1.9M.
quantile_levels = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 0.99, 1]
np.quantile(df_sample['n'], quantile_levels)
array([1.000000e+01, 1.100000e+01, 1.400000e+01, 1.700000e+01,
2.100000e+01, 2.700000e+01, 3.700000e+01, 5.500000e+01,
9.500000e+01, 2.250000e+02, 5.070000e+02, 2.779000e+03,
1.905973e+06])
# Composition of heavy links: aggregate (prev, curr) pairs whose per-row
# count is >= 500 (~the sample's 95th percentile), over the FULL data.
heavy = df[df['n'] >= 500]
summary_above_500_n = (
    heavy.groupby(['prev', 'curr'])['n']
         .sum()
         .reset_index()
         .sort_values('n', ascending=False)
)
# Percentage share of each pair within the heavy-link total.
summary_above_500_n['pc'] = summary_above_500_n['n'] / summary_above_500_n['n'].sum() * 100.0
summary_above_500_n
| prev | curr | n | pc | |
|---|---|---|---|---|
| 849302 | other-empty | Main_Page | 121492994 | 2.217457 |
| 800221 | other-empty | Hyphen-minus | 14966084 | 0.273157 |
| 1067400 | other-search | Animal_(2023_film) | 12146400 | 0.221693 |
| 1004960 | other-internal | Main_Page | 7001059 | 0.127781 |
| 1529326 | other-search | Salaar:_Part_1_–_Ceasefire | 6313615 | 0.115234 |
| ... | ... | ... | ... | ... |
| 1353665 | other-search | List_of_Celebrity_Juice_episodes | 500 | 0.000009 |
| 964544 | other-empty | Weapon_mount | 500 | 0.000009 |
| 1353765 | other-search | List_of_Chinese_Indonesians | 500 | 0.000009 |
| 246534 | Hans_Oster | Wilhelm_Canaris | 500 | 0.000009 |
| 48067 | Alex_G | Havertown,_Pennsylvania | 500 | 0.000009 |
1671491 rows × 4 columns
# Quantile plot of click counts for the sampled data.
# Keep the quantile levels in ONE list so the x axis and the computed
# quantiles cannot drift apart (the original duplicated the literal list).
q_levels = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95]
fig = go.Figure()
fig.add_trace(go.Scatter(
    x=q_levels,
    y=np.quantile(df_sample['n'], q_levels),
    mode='lines+markers',
    marker=dict(color='blue'),
    line=dict(color='blue'),
    name='Quantiles'
))
# Layout polish: titles on both axes, legend hidden (single trace).
fig.update_layout(
    title='Quantile Plot for Sample data',
    xaxis_title='Quantile Levels',
    yaxis_title='Quantile Values',
    showlegend=False
)
fig.show()
# Quantile plot of click counts for the FULL data set.
# Keep the quantile levels in ONE list so the x axis and the computed
# quantiles cannot drift apart (the original duplicated the literal list).
q_levels = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95]
fig = go.Figure()
fig.add_trace(go.Scatter(
    x=q_levels,
    y=np.quantile(df['n'], q_levels),
    mode='lines+markers',
    marker=dict(color='blue'),
    line=dict(color='blue'),
    name='Quantiles'
))
# Layout polish: titles on both axes, legend hidden (single trace).
fig.update_layout(
    title='Quantile Plot for N - All data',
    xaxis_title='Quantile Levels',
    yaxis_title='Quantile Values',
    showlegend=False
)
fig.show()
# Total clicks by referrer type over the full data, plus each type's share.
type_summary = df.groupby('type', as_index=False)['n'].sum()
type_summary['pc_num'] = type_summary['n'] / type_summary['n'].sum()
type_summary
| type | n | pc_num | |
|---|---|---|---|
| 0 | external | 4845206710 | 0.668648 |
| 1 | link | 2344086977 | 0.323489 |
| 2 | other | 56976431 | 0.007863 |
# Total clicks by referrer type in the sample — shares closely track the
# full-data proportions, another sign the sample is representative.
type_summary = df_sample.groupby('type', as_index=False)['n'].sum()
type_summary['pc_num'] = type_summary['n'] / type_summary['n'].sum()
type_summary
| type | n | pc_num | |
|---|---|---|---|
| 0 | external | 462746676 | 0.657070 |
| 1 | link | 235898815 | 0.334961 |
| 2 | other | 5612729 | 0.007970 |
# Box plot of click counts with outliers shown — expect a very long
# right tail given the skew seen in the quantiles.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=df_sample, x='n', showfliers=True, ax=ax)
ax.set_title('Box Plot of Click Counts (n)')
ax.set_xlabel('Number of Clicks')
plt.show()
# Top 10 traffic sources (prev) by total clicks, with their % share.
top_prev = df_sample.groupby('prev', as_index=False)['n'].sum()
top_prev['pc_num'] = top_prev['n'] / top_prev['n'].sum() * 100.0
# Keep only the ten largest sources.
top_prev = top_prev.sort_values(by='n', ascending=False).head(10)
print("Top 10 Prev Articles")
print(top_prev)
Top 10 Prev Articles
prev n pc_num
899861 other-search 342548946 48.639680
899857 other-empty 102526357 14.558063
899859 other-internal 10742881 1.525418
899858 other-external 5197126 0.737957
526857 Main_Page 3070536 0.435996
899860 other-other 1731366 0.245842
87946 Animal_(2023_film) 1047571 0.148748
458058 Leave_the_World_Behind_(film) 485666 0.068961
865302 Von_Erich_family 347917 0.049402
229464 Deaths_in_2023 272711 0.038723
# Bar chart of the top 10 referrer articles, annotated with their % share.
axis_labels = {'prev': 'Previous Article', 'n': 'Number of Clicks'}
fig_prev = px.bar(
    top_prev,
    x='prev',
    y='n',
    title='Top 10 Prev Articles by Total Clicks',
    labels=axis_labels,
    text='pc_num',
)
fig_prev.update_traces(texttemplate='%{text:.2f}%', textposition='outside')
fig_prev.update_layout(xaxis_title='Previous Article', yaxis_title='Number of Clicks')
fig_prev.show()
# Internal traffic only: exclude the special 'other-*' referrer buckets
# (other-search, other-empty, other-internal, ...).
# NOTE: the previous filter str.contains('other', case=False) also
# discarded genuine articles whose titles merely contain 'other'
# (e.g. 'Motherboard'); matching the literal 'other-' prefix removes
# only the special source buckets.
df_sample_internal = df_sample[~df_sample['prev'].str.startswith('other-')]
top_prev_int = df_sample_internal.groupby('prev')['n'].sum().reset_index()
top_prev_int['pc_num'] = top_prev_int['n'] / top_prev_int['n'].sum() * 100.0
top_prev_int = top_prev_int.sort_values(by='n', ascending=False).head(10)
# Bar chart of the top 10 internal referrer articles with % share labels.
fig_prev = px.bar(top_prev_int, x='prev', y='n',
                  title='Top 10 Prev Articles by Total Clicks (Internal)',
                  labels={'prev': 'Previous Article', 'n': 'Number of Clicks'},
                  text='pc_num')
fig_prev.update_traces(texttemplate='%{text:.2f}%', textposition='outside')
fig_prev.update_layout(yaxis_title='Number of Clicks', xaxis_title='Previous Article')
fig_prev.show()
# Top 10 destination articles (curr) by total clicks, with % share.
top_curr = df_sample.groupby('curr', as_index=False)['n'].sum()
top_curr['pc_num'] = top_curr['n'] / top_curr['n'].sum() * 100.0
top_curr = top_curr.sort_values(by='n', ascending=False).head(10)
print("\nTop 10 Current")
top_curr
Top 10 Current
| curr | n | pc_num | |
|---|---|---|---|
| 873253 | List_of_highest-grossing_Indian_films | 1918543 | 0.272420 |
| 1577639 | Wonka_(film) | 1366654 | 0.194056 |
| 1463198 | Timothée_Chalamet | 1361765 | 0.193362 |
| 327387 | Christmas | 982212 | 0.139468 |
| 417882 | Dick_Van_Dyke | 962672 | 0.136693 |
| 790885 | Killers_of_the_Flower_Moon_(film) | 844372 | 0.119895 |
| 313372 | Chaturbate | 755978 | 0.107344 |
| 198184 | Barry_Keoghan | 722455 | 0.102584 |
| 216581 | Benny_Blanco | 713041 | 0.101247 |
| 105723 | Al_Nassr_FC | 702055 | 0.099687 |
# Bar chart of the top 10 destination articles, annotated with % share.
axis_labels = {'curr': 'Current Article', 'n': 'Number of Clicks'}
fig_curr = px.bar(
    top_curr,
    x='curr',
    y='n',
    title='Top 10 Curr Articles by Total Clicks',
    labels=axis_labels,
    text='pc_num',
)
fig_curr.update_traces(texttemplate='%{text:.2f}%', textposition='outside')
fig_curr.update_layout(xaxis_title='Current Article', yaxis_title='Number of Clicks')
fig_curr.show()
# NOTE(review): this cell was a byte-for-byte duplicate of the previous
# fig_curr bar chart (same data, traces, title, and layout), so the same
# figure was rendered twice. Removed the duplicate.
# Top 10 (prev, curr) transitions by total clicks, with % share.
top_prev_curr = df_sample.groupby(['prev', 'curr'], as_index=False)['n'].sum()
top_prev_curr['pc_num'] = top_prev_curr['n'] / top_prev_curr['n'].sum() * 100.0
top_prev_curr = top_prev_curr.sort_values(by='n', ascending=False).head(10)
# Build a human-readable 'A -> B' label for display and plotting.
top_prev_curr['transition'] = top_prev_curr['prev'] + ' -> ' + top_prev_curr['curr']
top_prev_curr = top_prev_curr[['transition', 'n', 'pc_num']]
print("\nTop 10 Previous x Current Articles:")
top_prev_curr
Top 10 Previous x Current Articles:
| transition | n | pc_num | |
|---|---|---|---|
| 3109957 | other-search -> List_of_highest-grossing_India... | 1905973 | 0.270636 |
| 3281353 | other-search -> Wonka_(film) | 1359886 | 0.193095 |
| 3253427 | other-search -> Timothée_Chalamet | 996931 | 0.141558 |
| 2976831 | other-search -> Christmas | 944780 | 0.134152 |
| 2571000 | other-empty -> Killers_of_the_Flower_Moon_(film) | 817092 | 0.116022 |
| 2973458 | other-search -> Chaturbate | 755968 | 0.107342 |
| 2998541 | other-search -> Dick_Van_Dyke | 746190 | 0.105954 |
| 2945526 | other-search -> Barry_Keoghan | 721735 | 0.102482 |
| 2949687 | other-search -> Benny_Blanco | 712267 | 0.101137 |
| 2523130 | other-empty -> Freelance_(2023_film) | 629005 | 0.089315 |
# Bar chart of the top 10 prev -> curr transitions with % share labels.
axis_labels = {'transition': 'Transition (Prev -> Curr)', 'n': 'Number of Clicks'}
fig_prev_curr = px.bar(
    top_prev_curr,
    x='transition',
    y='n',
    title='Top 10 Prev x Curr Transitions by Total Clicks',
    labels=axis_labels,
    text='pc_num',
)
fig_prev_curr.update_traces(texttemplate='%{text:.2f}%', textposition='outside')
fig_prev_curr.update_layout(xaxis_title='Transition (Prev -> Curr)', yaxis_title='Number of Clicks')
fig_prev_curr.show()
# Internal-only transitions: exclude the special 'other-*' referrer
# buckets so we see article -> article navigation.
# NOTE: the previous filter str.contains('other', case=False) also
# dropped genuine articles whose titles merely contain 'other'
# (e.g. 'Motherboard'); matching the literal 'other-' prefix removes
# only the special source buckets (other-search, other-empty, ...).
df_sample_internal = df_sample[~df_sample['prev'].str.startswith('other-')]
top_prev_curr_intl = df_sample_internal.groupby(['prev', 'curr'])['n'].sum().reset_index()
top_prev_curr_intl['pc_num'] = top_prev_curr_intl['n'] / top_prev_curr_intl['n'].sum() * 100.0
top_prev_curr_intl = top_prev_curr_intl.sort_values(by='n', ascending=False).head(10)
# Human-readable 'A -> B' label for display and plotting.
top_prev_curr_intl['transition'] = top_prev_curr_intl['prev'] + ' -> ' + top_prev_curr_intl['curr']
top_prev_curr_intl = top_prev_curr_intl[['transition', 'n', 'pc_num']]
print("\nTop 10 Previous x Current Articles:")
top_prev_curr_intl
# Visualization for Top 10 internal Prev x Curr Transitions
fig_prev_curr_intl = px.bar(top_prev_curr_intl, x='transition', y='n',
                            title='Top 10 Prev x Curr Transitions by Total Clicks',
                            labels={'transition': 'Transition (Prev -> Curr)', 'n': 'Number of Clicks'},
                            text='pc_num')
fig_prev_curr_intl.update_traces(texttemplate='%{text:.2f}%', textposition='outside')
fig_prev_curr_intl.update_layout(yaxis_title='Number of Clicks', xaxis_title='Transition (Prev -> Curr)')
fig_prev_curr_intl.show()
Top 10 Previous x Current Articles:
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
def tokenize_top500(df, text_column):
    r"""Print the 500 most common word tokens in *text_column* of *df*.

    Tokens are lower-cased runs of word characters (regex ``\w+``). Note
    that ``\w`` includes underscores, so an underscored title such as
    'History_of_Paris' stays a single token up to the first non-word
    character (e.g. a parenthesis).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the titles to tokenize.
    text_column : str
        Name of the column containing the titles.

    Returns
    -------
    pandas.DataFrame
        *df*, unchanged, so the call can be chained.
    """
    all_tokens = [
        word.lower()
        for title in df[text_column]
        for word in re.findall(r'\w+', title)
    ]
    token_counts = Counter(all_tokens)
    # FIX: the original comment claimed "top 50"; the code (and the function
    # name) take the top 500 — the docs now match the behavior.
    print("Top 500 most common tokens:", token_counts.most_common(500))
    return df
# Find the top 500 tokens in the previous-article ('prev') column
tokenize_top500(df,'prev')
Top 500 most common tokens: [('other', 8819602), ('search', 3882356), ('empty', 3007878), ('internal', 1087731), ('external', 494510), ('main_page', 261821), ('film', 224711), ('c', 138562), ('tv_series', 112505), ('album', 70801), ('disambiguation', 68804), ('2023', 68231), ('_season', 51878), ('s', 51139), ('actor', 50989), ('song', 49344), ('band', 47565), ('_california', 31693), ('a', 31399), ('2022', 26910), ('_', 26437), ('footballer', 26070), ('i', 23867), ('musician', 23213), ('novel', 21653), ('video_game', 20734), ('united_states', 19345), ('_texas', 17644), ('_new_york', 17238), ('actress', 17028), ('singer', 16866), ('k', 16841), ('d', 16838), ('s_basketball_team', 16657), ('st', 16325), ('j', 15969), ('e', 15537), ('franchise', 15450), ('m', 15377), ('_men', 15255), ('t', 15109), ('l', 14749), ('2020', 14400), ('soundtrack', 14282), ('f', 14227), ('character', 13976), ('2021', 13962), ('u', 13314), ('season_1', 13211), ('_florida', 13188), ('american_football', 13124), ('2019', 13061), ('r', 13017), ('given_name', 12836), ('anti', 12794), ('india', 12767), ('company', 12740), ('american_tv_series', 12727), ('al', 12557), ('season_2', 11943), ('_pennsylvania', 11754), ('wrestler', 11596), ('billboard_year', 11550), ('present', 11408), ('basketball', 11383), ('wikipedia', 11241), ('politician', 11190), ('2000', 10819), ('miniseries', 10798), ('season_3', 10726), ('2', 10566), ('surname', 10488), ('musical', 10470), ('1', 10446), ('g', 10422), ('_illinois', 10283), ('p', 10273), ('comics', 10132), ('film_series', 10070), ('o', 10058), ('_ohio', 9730), ('star_trek', 9662), ('mr', 9661), ('name', 9592), ('2018', 9468), ('2017', 9329), ('b', 9316), ('n', 9199), ('2010', 9137), ('season_4', 9029), ('2023_film', 8956), ('2009', 8944), ('_j', 8854), ('_georgia', 8770), ('1999', 8533), ('s_basketball', 8458), ('_virginia', 8387), ('_new_jersey', 8321), ('h', 8265), ('2013', 8093), ('2015', 8080), ('2016', 8062), ('2014', 8056), ('_north_carolina', 7954), 
('_women', 7936), ('list_of_highest', 7675), ('2011', 7666), ('season_5', 7549), ('_massachusetts', 7501), ('2022_film', 7442), ('_michigan', 7438), ('list_of_best', 7273), ('rapper', 7271), ('uk_parliament_constituency', 7270), ('2019_film', 7233), ('women', 7149), ('jean', 7120), ('_d', 7113), ('united_kingdom', 7075), ('british_tv_series', 7045), ('2012', 7034), ('z', 7032), ('mythology', 6996), ('it', 6965), ('mercedes', 6940), ('list_of_u', 6822), ('v', 6692), ('_r', 6578), ('director', 6542), ('_episodes', 6503), ('_tennessee', 6471), ('people', 6373), ('dr', 6257), ('season_6', 6211), ('2001', 6198), ('_washington', 6122), ('x', 6117), ('don', 6100), ('deaths_in_2023', 6054), ('2008', 6040), ('2016_film', 6037), ('2007', 5971), ('3', 5963), ('spider', 5949), ('_characters', 5944), ('2018_film', 5893), ('2005', 5878), ('doctor_who', 5849), ('state', 5805), ('2006', 5787), ('_s', 5691), ('2015_film', 5579), ('4', 5574), ('2010_film', 5565), ('s_singles', 5542), ('play', 5527), ('_alabama', 5458), ('2017_film', 5449), ('5', 5400), ('2003', 5394), ('_indiana', 5392), ('magazine', 5357), ('2021_film', 5349), ('number', 5319), ('tv', 5306), ('_missouri', 5238), ('1990', 5205), ('man', 5184), ('non', 5159), ('book', 5093), ('2011_film', 5080), ('_k', 5067), ('in', 5043), ('_inc', 5032), ('2013_film', 5021), ('marvel_comics', 5012), ('_a', 5008), ('_colorado', 5006), ('2012_film', 4934), ('israel', 4918), ('indo', 4913), ('2005_film', 4912), ('2014_film', 4873), ('ep', 4828), ('1989', 4828), ('anglo', 4820), ('_arizona', 4819), ('2004', 4805), ('2006_film', 4789), ('s_national_football_team', 4787), ('_kentucky', 4768), ('_wisconsin', 4747), ('w', 4744), ('california', 4710), ('season_7', 4683), ('class_submarine', 4668), ('france', 4659), ('men', 4653), ('2002', 4627), ('_mississippi', 4626), ('self', 4625), ('_minnesota', 4621), ('_p', 4541), ('writer', 4506), ('music', 4495), ('class_destroyer', 4453), ('_m', 4449), ('2004_film', 4387), ('2009_film', 4368), 
('professional_wrestling', 4334), ('to', 4332), ('_maryland', 4314), ('_oregon', 4296), ('2002_film', 4295), ('high', 4288), ('2007_film', 4281), ('1997_film', 4256), ('software', 4229), ('china', 4206), ('_louisiana', 4173), ('_connecticut', 4169), ('ice_hockey', 4139), ('russia', 4130), ('businessman', 4126), ('_iowa', 4108), ('all', 4087), ('post', 4083), ('australia', 4067), ('7', 4061), ('2008_film', 4047), ('_south_carolina', 4036), ('comedian', 4031), ('baseball', 4013), ('1945', 3996), ('1996_film', 3992), ('_v', 3991), ('series', 3970), ('_b', 3970), ('japan', 3958), ('mathematics', 3953), ('1998', 3936), ('1996', 3911), ('1995', 3890), ('2020_film', 3862), ('1997', 3847), ('_ontario', 3816), ('the_x', 3809), ('_c', 3805), ('1995_film', 3791), ('star_wars', 3777), ('_los_angeles', 3761), ('fighter', 3738), ('_london', 3719), ('1939', 3710), ('1998_film', 3689), ('programming_language', 3685), ('germany', 3676), ('batman', 3675), ('canada', 3674), ('2000_film', 3653), ('6', 3632), ('law_', 3630), ('or', 3608), ('season_8', 3605), ('_oklahoma', 3599), ('8', 3583), ('1992', 3580), ('_the_movie', 3556), ('_alaska', 3539), ('united_states_relations', 3512), ('no', 3510), ('_players', 3501), ('earth', 3498), ('iran', 3482), ('1994', 3481), ('hamas_war', 3476), ('list_of_wars', 3466), ('état', 3464), ('2001_film', 3464), ('1999_film', 3460), ('boxer', 3446), ('s_basketball_tournament', 3435), ('afro', 3420), ('_g', 3407), ('10', 3398), ('list_of_warner_bros', 3390), ('missile', 3380), ('_state', 3369), ('_order', 3351), ('manga', 3348), ('dc_comics', 3345), ('_arkansas', 3308), ('class_frigate', 3302), ('_kansas', 3301), ('marvel_cinematic_universe', 3300), ('2003_film', 3291), ('king', 3273), ('9', 3265), ('star_game', 3264), ('1979', 3261), ('files', 3210), ('1991', 3208), ('_t', 3191), ('on', 3189), ('pre', 3182), ('group', 3163), ('s_national_basketball_team', 3144), ('saint', 3128), ('wiki', 3124), ('pakistan', 3120), ('indonesia', 3081), ('uk', 3056), 
('_new_mexico', 3034), ('_new_south_wales', 3024), ('1980', 3023), ('proto', 3022), ('cricketer', 2997), ('journalist', 2984), ('city', 2982), ('_war', 2964), ('1993', 2938), ('24', 2925), ('de', 2904), ('the', 2899), ('_west_virginia', 2881), ('author', 2880), ('1989_film', 2859), ('royal_rumble_', 2855), ('1994_film', 2852), ('1993_film', 2838), ('one', 2837), ('you', 2830), ('2024', 2824), ('class_cruiser', 2815), ('_w', 2814), ('_maine', 2768), ('filmmaker', 2750), ('composer', 2745), ('washington', 2742), ('rugby_union', 2715), ('12', 2713), ('covid', 2709), ('world_war_ii', 2701), ('philippines', 2699), ('1940', 2696), ('region', 2695), ('new_york', 2682), ('neo', 2679), ('food', 2673), ('ireland', 2671), ('_seasons', 2650), ('s_tournament', 2639), ('bb', 2636), ('italy', 2621), ('texas', 2619), ('rolls', 2616), ('rupaul', 2599), ('0', 2592), ('20', 2572), ('list_of_most', 2557), ('1969', 2543), ('south_korean_tv_series', 2537), ('goldwyn', 2535), ('ukraine', 2533), ('list_of_universal_pictures_films_', 2527), ('afi', 2522), ('1919', 2519), ('class_battleship', 2516), ('poland', 2514), ('spain', 2513), ('doctor_who_', 2496), ('1985_film', 2486), ('season_9', 2481), ('1990_film', 2473), ('grey', 2471), ('america', 2468), ('list_of_marvel_comics_characters', 2463), ('general', 2459), ('producer', 2455), ('1988_film', 2438), ('train', 2433), ('los_angeles', 2428), ('new_york_city', 2427), ('11', 2410), ('_nevada', 2408), ('s_national_handball_team', 2379), ('_l', 2374), ('greatest_hits_', 2351), ('y', 2338), ('tv_channel', 2323), ('moon', 2323), ('florida', 2321), ('_montana', 2320), ('_utah', 2315), ('dancing_with_the_stars_', 2310), ('man_', 2306), ('computing', 2306), ('s_100_years', 2287), ('ufc_fight_night', 2285), ('the_sopranos', 2283), ('_nebraska', 2283), ('sukhoi_su', 2282), ('1970', 2280), ('the_voice_', 2279), ('hungary', 2278), ('let', 2275), ('video_game_series', 2274), ('bird', 2273), ('com', 2265), ('an', 2262), ('_films_', 2260), 
('_holy_roman_emperor', 2249), ('brand', 2247), ('ukrainian_war', 2246), ('south_africa', 2246), ('horse', 2237), ('turkey', 2234), ('cv', 2230), ('russo', 2230), ('1992_film', 2220), ('tv_program', 2214), ('1949', 2212), ('malaysia', 2211), ('mexico', 2204), ('two', 2203), ('john_f', 2200), ('wehrmacht', 2200), ('saturday_night_live_', 2199), ('dog', 2195), ('young', 2194), ('1987_film', 2193), ('the_walking_dead', 2192), ('country', 2177), ('doo', 2173), ('brazil', 2169), ('1950', 2169), ('brien', 2153), ('_managers', 2150), ('1981_film', 2135), ('call_of_duty', 2134), ('s_handball_championship', 2128), ('of', 2127), ('30', 2116), ('q', 2114), ('1986_film', 2112), ('_louis', 2095), ('ship', 2090), ('season_10', 2090), ('_1', 2089), ('biology', 2087), ('1985', 2074), ('pan', 2072), ('manhattan', 2071), ('disney', 2067), ('list_of_eastenders_characters_', 2066), ('1978_film', 2059), ('1960', 2052), ('oh', 2050), ('trans', 2048), ('physics', 2047), ('_decorations', 2045), ('warner_bros', 2045), ('scooby', 2041), ('1944', 2041), ('na', 2040), ('1988', 2039), ('_the_next_generation', 2037), ('_h', 2035), ('_india', 2030), ('_2', 2030), ('what', 2027), ('1984_film', 2021), ('list_of_playstation_2_games_', 2017), ('_idaho', 2003), ('list_of_metro', 2001), ('19', 1999), ('1991_film', 1999), ('bangladesh', 1995), ('orders', 1992), ('south_korea', 1992), ('chicago', 1974), ('cross', 1970), ('fm', 1966), ('1974_film', 1965), ('list_of_2023_albums', 1965), ('greece', 1965), ('1982_film', 1965), ('s_law', 1964), ('_e', 1964), ('darts_player', 1962), ('transformers', 1959), ('s_college', 1954), ('list_of_one', 1951), ('can', 1942), ('ss', 1940), ('bigg_boss_', 1935), ('2029', 1933), ('15', 1922), ('south_park', 1920), ('assassin', 1917)]
| prev | curr | type | n | yearmonth | |
|---|---|---|---|---|---|
| 0 | Wethersfield,_Essex | The_Hundred_Parishes | link | 23 | 202312 |
| 1 | History_of_Paris | History_of_Paris_(1946–2000) | link | 16 | 202312 |
| 2 | other-search | The_Hundred_Parishes | external | 23 | 202312 |
| 3 | Paris | History_of_Paris_(1946–2000) | link | 18 | 202312 |
| 4 | other-search | Christian_McNeish | external | 11 | 202312 |
| ... | ... | ... | ... | ... | ... |
| 32927436 | Udaydeva | Dhruvadeva | link | 14 | 202312 |
| 32927437 | other-search | Anne_Davidson | external | 18 | 202312 |
| 32927438 | Duxford | The_Hundred_Parishes | link | 16 | 202312 |
| 32927439 | Essex | The_Hundred_Parishes | link | 10 | 202312 |
| 32927440 | other-empty | The_Hundred_Parishes | external | 42 | 202312 |
32927441 rows × 5 columns
# Find the top 500 tokens in the current-article ('curr') column
tokenize_top500(df,'curr')
Top 500 most common tokens: [('film', 273703), ('tv_series', 141595), ('c', 141073), ('main_page', 118975), ('album', 101737), ('disambiguation', 80381), ('actor', 78214), ('band', 73718), ('song', 71438), ('footballer', 70691), ('s', 61615), ('_california', 41262), ('video_game', 39209), ('musician', 38267), ('a', 37450), ('novel', 33710), ('_', 33536), ('united_states', 31999), ('american_football', 31654), ('i', 31539), ('st', 30721), ('2023', 28084), ('politician', 27842), ('_season', 26578), ('singer', 26321), ('_new_york', 26104), ('_texas', 25225), ('j', 23371), ('d', 23226), ('basketball', 23211), ('actress', 23077), ('e', 22393), ('t', 21922), ('k', 21838), ('_men', 21816), ('surname', 21284), ('al', 21054), ('s_basketball_team', 20359), ('2022', 19781), ('company', 19276), ('m', 19161), ('_pennsylvania', 19152), ('u', 18744), ('1', 18430), ('r', 18281), ('2', 18187), ('_florida', 18146), ('l', 16810), ('f', 16652), ('comics', 16584), ('wrestler', 16365), ('india', 16227), ('american_tv_series', 15411), ('character', 15173), ('o', 14639), ('_illinois', 14627), ('p', 14622), ('_ohio', 14605), ('anti', 14428), ('_j', 14354), ('b', 14241), ('jean', 14144), ('g', 14047), ('n', 13507), ('_new_jersey', 13457), ('_virginia', 13377), ('magazine', 13373), ('_women', 13166), ('baseball', 12846), ('2021', 12579), ('soundtrack', 12367), ('uk_parliament_constituency', 12000), ('_georgia', 11941), ('given_name', 11656), ('h', 11326), ('united_kingdom', 11295), ('_michigan', 11290), ('mr', 11271), ('2020', 11179), ('_north_carolina', 10951), ('3', 10932), ('play', 10630), ('star_trek', 10501), ('name', 10496), ('tv', 10451), ('musical', 10407), ('rapper', 10403), ('mythology', 10401), ('writer', 10271), ('_massachusetts', 10256), ('ice_hockey', 10226), ('miniseries', 10193), ('_d', 10180), ('v', 10063), ('2019', 9889), ('director', 9840), ('franchise', 9766), ('4', 9726), ('don', 9440), ('ep', 9402), ('5', 9400), ('_washington', 9283), ('it', 9216), ('_missouri', 9178), 
('s_basketball', 9169), ('people', 9104), ('_r', 9045), ('_tennessee', 8903), ('s_singles', 8877), ('women', 8856), ('_s', 8617), ('fm', 8593), ('book', 8580), ('2023_film', 8579), ('music', 8499), ('_indiana', 8431), ('2019_film', 8421), ('saint', 8408), ('_alabama', 8391), ('2018', 8381), ('_wisconsin', 8376), ('mercedes', 8366), ('2022_film', 8357), ('2017', 8347), ('_inc', 8282), ('_minnesota', 8162), ('_new_south_wales', 8135), ('w', 7860), ('non', 7814), ('x', 7793), ('in', 7748), ('france', 7723), ('dr', 7665), ('software', 7641), ('california', 7564), ('_k', 7546), ('cricketer', 7530), ('_kentucky', 7416), ('_colorado', 7361), ('2016_film', 7325), ('british_tv_series', 7278), ('2018_film', 7272), ('2016', 7257), ('no', 7185), ('2014', 7138), ('season_2', 7131), ('australia', 7077), ('_maryland', 7062), ('_m', 7029), ('2015', 7003), ('2013', 6990), ('_v', 6986), ('7', 6975), ('2015_film', 6966), ('2010_film', 6953), ('self', 6907), ('journalist', 6906), ('season_1', 6865), ('marvel_comics', 6859), ('_oregon', 6846), ('_iowa', 6828), ('2017_film', 6803), ('state', 6802), ('_arizona', 6799), ('_p', 6749), ('2011', 6672), ('artist', 6660), ('2010', 6534), ('_louisiana', 6509), ('2014_film', 6471), ('_c', 6461), ('author', 6451), ('6', 6443), ('2021_film', 6439), ('businessman', 6416), ('2013_film', 6377), ('rugby_union', 6357), ('2011_film', 6347), ('2012_film', 6330), ('to', 6278), ('_ontario', 6261), ('man', 6223), ('2006_film', 6209), ('2005_film', 6196), ('_london', 6177), ('_a', 6142), ('spider', 6138), ('_mississippi', 6126), ('season_3', 6018), ('film_series', 6008), ('2012', 5975), ('8', 5939), ('_los_angeles', 5922), ('2009_film', 5920), ('manga', 5906), ('de', 5886), ('s_church', 5874), ('japan', 5863), ('boxer', 5857), ('2009', 5834), ('_kansas', 5834), ('present', 5829), ('2001', 5806), ('mathematics', 5798), ('_b', 5780), ('doctor_who', 5778), ('russia', 5736), ('post', 5700), ('anglo', 5672), ('china', 5663), ('_oklahoma', 5661), ('canada', 5649), 
('2000', 5648), ('2007_film', 5609), ('germany', 5590), ('covid', 5586), ('class_submarine', 5552), ('israel', 5542), ('_south_carolina', 5517), ('2002_film', 5516), ('association_football', 5512), ('9', 5508), ('_connecticut', 5503), ('_queensland', 5488), ('2008_film', 5465), ('10', 5454), ('professional_wrestling', 5425), ('2004_film', 5417), ('2007', 5413), ('comedian', 5376), ('king', 5353), ('composer', 5352), ('indo', 5351), ('1997_film', 5342), ('am', 5332), ('_arkansas', 5322), ('2008', 5320), ('_west_virginia', 5283), ('washington', 5259), ('programming_language', 5223), ('group', 5205), ('2020_film', 5147), ('producer', 5120), ('missile', 5095), ('horse', 5090), ('high', 5083), ('1996_film', 5083), ('on', 5073), ('season_4', 4965), ('new_york_city', 4963), ('_victoria', 4946), ('star_wars', 4936), ('class_destroyer', 4929), ('world_war_ii', 4902), ('_alaska', 4872), ('dc_comics', 4864), ('com', 4821), ('_g', 4812), ('2005', 4812), ('2000_film', 4750), ('men', 4749), ('uk', 4748), ('all', 4740), ('2006', 4718), ('series', 4704), ('1995_film', 4681), ('1999_film', 4645), ('biology', 4607), ('état', 4532), ('_maine', 4521), ('fighter', 4496), ('1998_film', 4483), ('1999', 4456), ('_new_mexico', 4439), ('italy', 4412), ('river', 4408), ('_w', 4391), ('spain', 4324), ('12', 4320), ('2001_film', 4315), ('_t', 4309), ('texas', 4307), ('general', 4303), ('racing_driver', 4284), ('2003_film', 4281), ('s_basketball_tournament', 4265), ('you', 4217), ('city', 4204), ('2003', 4148), ('season_5', 4147), ('los_angeles', 4122), ('white', 4118), ('earth', 4100), ('the', 4099), ('_nebraska', 4094), ('south_africa', 4004), ('number', 3998), ('_utah', 3996), ('filmmaker', 3965), ('batman', 3953), ('0', 3905), ('an', 3898), ('z', 3885), ('ukraine', 3885), ('2004', 3885), ('pakistan', 3878), ('new_york', 3867), ('ireland', 3856), ('iran', 3843), ('soviet_union', 3840), ('computing', 3822), ('_h', 3816), ('2002', 3784), ('tv_channel', 3780), ('philippines', 3757), 
('_the_movie', 3754), ('region', 3711), ('poland', 3708), ('_montana', 3691), ('red', 3676), ('1994_film', 3655), ('11', 3655), ('food', 3651), ('black', 3648), ('rugby_league', 3646), ('1989_film', 3627), ('one', 3617), ('manhattan', 3616), ('ship', 3595), ('neo', 3594), ('let', 3580), ('turkey', 3569), ('_nevada', 3548), ('_l', 3536), ('1993_film', 3524), ('young', 3510), ('brazil', 3506), ('bishop', 3490), ('soccer', 3484), ('1992', 3482), ('chicago', 3460), ('proto', 3454), ('law_', 3450), ('1945', 3442), ('automobile', 3433), ('pre', 3425), ('1988_film', 3420), ('moon', 3415), ('wehrmacht', 3413), ('youtube', 3405), ('_holy_roman_emperor', 3404), ('1996', 3397), ('short_story', 3384), ('ufc_fight_night', 3374), ('season_6', 3366), ('class_frigate', 3352), ('indonesia', 3351), ('1998', 3336), ('florida', 3335), ('class_cruiser', 3323), ('law', 3316), ('1995', 3305), ('y', 3295), ('_co', 3290), ('1997', 3282), ('marvel_cinematic_universe', 3273), ('the_x', 3264), ('two', 3261), ('s_national_football_team', 3254), ('john_f', 3249), ('_2', 3237), ('1990_film', 3237), ('_western_australia', 3222), ('train', 3211), ('german_submarine_u', 3208), ('_characters', 3205), ('mexico', 3199), ('record_producer', 3192), ('military', 3192), ('catholic_church', 3188), ('list_of_best', 3183), ('new_zealand', 3180), ('game', 3173), ('1987_film', 3167), ('austria', 3151), ('na', 3140), ('list_of_marvel_comics_characters', 3139), ('_war', 3138), ('philosophy', 3133), ('1985_film', 3132), ('warner_bros', 3124), ('netherlands', 3114), ('1994', 3113), ('_1', 3107), ('brien', 3103), ('ii', 3102), ('30', 3093), ('rolls', 3090), ('malaysia', 3085), ('s_law', 3080), ('_order', 3079), ('unit', 3067), ('1990', 3064), ('brand', 3035), ('1989', 3029), ('british_army_officer', 3029), ('_n', 3005), ('1992_film', 3002), ('_idaho', 2988), ('south_korea', 2984), ('_new_hampshire', 2982), ('15', 2970), ('ss', 2969), ('physics', 2964), ('list_of_number', 2954), ('psychology', 2946), ('1986_film', 
2944), ('_smith', 2936), ('plant', 2931), ('1993', 2919), ('_john', 2916), ('hong_kong', 2912), ('russo', 2911), ('dd', 2909), ('_e', 2902), ('list_of_u', 2897), ('sweden', 2896), ('sur', 2892), ('tennis', 2891), ('of', 2887), ('s_college', 2887), ('town', 2882), ('heckler_', 2880), ('_episodes', 2876), ('london', 2874), ('marie', 2873), ('greece', 2863), ('hungary', 2862), ('_south_dakota', 2847), ('world_war_i', 2841), ('can', 2840), ('god', 2839), ('poet', 2832), ('new_jersey', 2819), ('1991', 2814), ('1981_film', 2814), ('s_tournament', 2805), ('america', 2801), ('south_korean_tv_series', 2800), ('20', 2795), ('1991_film', 2791), ('list_of_highest', 2785), ('_1st_baronet', 2784), ('historian', 2770), ('what', 2762), ('disney', 2750), ('s_theorem', 2750), ('argentina', 2748), ('files', 2742), ('star_game', 2733), ('linguistics', 2730), ('that', 2728), ('bird', 2727), ('oh', 2726), ('1982_film', 2726), ('medicine', 2720), ('egypt', 2720), ('1984_film', 2707), ('dog', 2706), ('cross', 2705), ('restaurant', 2705), ('denmark', 2698), ('_louis', 2681), ('1978_film', 2673), ('journal', 2661), ('_british_columbia', 2654), ('judge', 2652), ('opera', 2644), ('pennsylvania', 2608), ('or', 2600), ('_north_dakota', 2598), ('_johnson', 2595), ('_o', 2592), ('england', 2585), ('american_band', 2581), ('s_party', 2580), ('season_7', 2576), ('michigan', 2574), ('_singles', 2565), ('ohio', 2562), ('bangladesh', 2561), ('norway', 2558), ('massachusetts', 2555), ('_vermont', 2555), ('_kennedy', 2551), ('1980_film', 2551), ('wide_receiver', 2550), ('s_disease', 2550), ('and', 2546), ('s_national_basketball_team', 2536)]
| prev | curr | type | n | yearmonth | |
|---|---|---|---|---|---|
| 0 | Wethersfield,_Essex | The_Hundred_Parishes | link | 23 | 202312 |
| 1 | History_of_Paris | History_of_Paris_(1946–2000) | link | 16 | 202312 |
| 2 | other-search | The_Hundred_Parishes | external | 23 | 202312 |
| 3 | Paris | History_of_Paris_(1946–2000) | link | 18 | 202312 |
| 4 | other-search | Christian_McNeish | external | 11 | 202312 |
| ... | ... | ... | ... | ... | ... |
| 32927436 | Udaydeva | Dhruvadeva | link | 14 | 202312 |
| 32927437 | other-search | Anne_Davidson | external | 18 | 202312 |
| 32927438 | Duxford | The_Hundred_Parishes | link | 16 | 202312 |
| 32927439 | Essex | The_Hundred_Parishes | link | 10 | 202312 |
| 32927440 | other-empty | The_Hundred_Parishes | external | 42 | 202312 |
32927441 rows × 5 columns
## Added hardcoded categories based on tokens that we saw
# Hand-built category -> keyword map derived from the top-500 token dumps
# above. NOTE: insertion order matters — the category() classifier below
# returns the FIRST category whose keyword matches, so earlier entries take
# precedence on ambiguous titles (e.g. 'drama' appears in both Literature
# and Entertainment; Literature wins).
# NOTE(review): keywords containing '-' ('singer-songwriter',
# 'search-engine', 'non-wikipedia', 'non-specified') can never match,
# because tokens are produced by re.findall(r'\w+'), which cannot emit a
# hyphen — confirm whether these were meant as '_' variants.
categories = {
    'Film': ['film', 'actor', 'actress', 'director', 'cinema', 'tv_series', 'movie', 'disney', 'marvel_comics',
             'soundtrack', 'franchise', 'season', 'series', 'spider', 'star_wars', 'screenplay'],
    'Music': ['music', 'song', 'album', 'band', 'singer', 'musician', 'concert', 'genre', 'orchestra', 'rap', 'composer',
              'rock', 'pop', 'hip_hop', 'jazz', 'classical', 'punk', 'singer-songwriter'],
    'Sports': ['football', 'soccer', 'basketball', 'tennis', 'cricket', 'athlete', 'olympics', 'baseball', 'hockey',
               'wrestler', 'racing', 'nba', 'mlb', 'nfl', 'fifa', 'boxing', 'golf', 'ufc', 'cycling', 'rugby'],
    'Literature': ['novel', 'book', 'author', 'poem', 'literature', 'writer', 'story', 'drama', 'fiction', 'comics',
                   'play', 'manga', 'library', 'publication', 'screenplay'],
    'Technology': ['technology', 'computer', 'software', 'hardware', 'internet', 'ai', 'robotics', 'programming',
                   'cybersecurity', 'data', 'machine_learning', 'blockchain', 'cloud', 'startup', 'python', 'java'],
    'Science': ['science', 'biology', 'chemistry', 'physics', 'astronomy', 'geology', 'mathematics', 'research',
                'medicine', 'dna', 'genetics', 'covid', 'ecosystem', 'energy', 'nasa', 'space', 'climate_change'],
    'History': ['history', 'ancient', 'medieval', 'war', 'revolution', 'empire', 'dynasty', 'historical', 'civilization',
                'world_war', 'holocaust', 'conflict', 'battle', 'civil_war', 'kingdom', 'renaissance'],
    'Geography': ['country', 'city', 'mountain', 'river', 'continent', 'ocean', 'island', 'region', 'california',
                  'united_states', 'state', 'capital', 'landmark', 'earthquake', 'climate', 'volcano', 'population'],
    'Politics': ['politics', 'election', 'government', 'president', 'minister', 'congress', 'senate', 'diplomacy',
                 'parliament', 'constitution', 'supreme_court', 'campaign', 'voting', 'policy'],
    'Business': ['business', 'economy', 'market', 'company', 'finance', 'trade', 'investment', 'startup', 'banking',
                 'stock_market', 'retail', 'commerce', 'entrepreneurship', 'merger'],
    'Entertainment': ['celebrity', 'tv', 'show', 'media', 'theatre', 'festival', 'award', 'netflix', 'sitcom', 'drama',
                      'comedy', 'reality_tv', 'game_show', 'hollywood', 'bollywood', 'documentary'],
    'Education': ['education', 'university', 'college', 'school', 'degree', 'student', 'research', 'professor',
                  'teacher', 'scholarship', 'academic', 'curriculum', 'exam', 'lecture', 'tuition'],
    'Health': ['health', 'medicine', 'disease', 'doctor', 'medical', 'surgery', 'hospital', 'vaccine', 'pandemic',
               'mental_health', 'nutrition', 'pharmaceutical', 'wellness', 'nurse', 'emergency'],
    'Religion': ['religion', 'church', 'temple', 'faith', 'spirituality', 'god', 'mythology', 'bible', 'islam',
                 'christianity', 'hinduism', 'buddhism', 'ritual', 'priest', 'sacred'],
    # Clickstream pseudo-pages: 'other-search' titles tokenize to
    # ['other', 'search'], so 'search' routes them here.
    'Search': ['search', 'google', 'bing', 'yahoo', 'ask', 'search-engine', 'main_page', 'search_query'],
    # NOTE(review): 'internal' is grouped under Other-external here
    # ('other-internal' traffic) — confirm that is intended.
    'Other-external': ['external', 'outside', 'non-wikipedia', 'third_party', 'miscellaneous', 'empty', 'internal'],
    'Other-disambiguation': ['disambiguation', 'other', 'non-specified', '2020', '2021', '2022', '2023', '2024']
}
df.head()
| prev | curr | type | n | yearmonth | |
|---|---|---|---|---|---|
| 0 | Wethersfield,_Essex | The_Hundred_Parishes | link | 23 | 202312 |
| 1 | History_of_Paris | History_of_Paris_(1946–2000) | link | 16 | 202312 |
| 2 | other-search | The_Hundred_Parishes | external | 23 | 202312 |
| 3 | Paris | History_of_Paris_(1946–2000) | link | 18 | 202312 |
| 4 | other-search | Christian_McNeish | external | 11 | 202312 |
def category(title, cat_map=None):
    r"""Classify an article *title* into the first matching category.

    The lower-cased title is tokenized with ``\w+``; since ``\w`` includes
    underscores, 'History_of_Paris_(1946–2000)' yields the tokens
    'history_of_paris_', '1946', '2000' — only words set off by non-word
    characters (such as parentheses) can match a keyword.

    Parameters
    ----------
    title : str
        Article title as it appears in the clickstream data.
    cat_map : dict[str, list[str]] | None, optional
        Mapping of category name -> keyword list. Defaults to the
        module-level ``categories``. Iteration order matters: the first
        category with a matching keyword wins.

    Returns
    -------
    str
        The matching category name, or 'Other' when nothing matches.
    """
    if cat_map is None:
        cat_map = categories
    tokens = set(re.findall(r'\w+', title.lower()))  # set: O(1) membership
    # FIX: the loop variable was named `category`, shadowing the function
    # itself inside its own body — renamed for clarity.
    for cat_name, keywords in cat_map.items():
        if any(keyword in tokens for keyword in keywords):
            return cat_name
    return 'Other'
# Apply to --> curr and prev columns: label every source and destination
# title with its first matching category.
# NOTE(review): .apply over ~33M rows is slow — consider caching per unique
# title (titles repeat heavily) if this becomes a bottleneck.
df['prev_category'] = df['prev'].apply(category)
df['curr_category'] = df['curr'].apply(category)
# Spot-check the assignments side by side.
df[['prev', 'prev_category', 'curr', 'curr_category']].head()
| prev | prev_category | curr | curr_category | |
|---|---|---|---|---|
| 0 | Wethersfield,_Essex | Other | The_Hundred_Parishes | Other |
| 1 | History_of_Paris | Other | History_of_Paris_(1946–2000) | Other |
| 2 | other-search | Search | The_Hundred_Parishes | Other |
| 3 | Paris | Other | History_of_Paris_(1946–2000) | Other |
| 4 | other-search | Search | Christian_McNeish | Other |
# Keep only rows where BOTH endpoints were classified (the original comment
# said "Other -> Other", but the filter drops any row with an 'Other' on
# either side — that is what the & of the two conditions does).
# FIX: .copy() detaches the filtered slice from df, so the column
# assignments made later (curr_cat_encoded, n_normalized) do not trigger
# pandas' SettingWithCopyWarning.
df_2 = df[(df['prev_category'] != 'Other') & (df['curr_category'] != 'Other')].copy()
# Total clicks per source category, plus each category's percentage share.
cat_prev = df_2.groupby('prev_category')['n'].sum().reset_index()
cat_prev['pc_num'] = cat_prev['n'] / cat_prev['n'].sum() * 100.0
cat_prev = cat_prev.sort_values(by='n', ascending=False).head(10)
print("\nTop 10 previous")
cat_prev
Top 10 previous
| prev_category | n | pc_num | |
|---|---|---|---|
| 14 | Search | 204458188 | 48.585490 |
| 10 | Other-external | 196743177 | 46.752168 |
| 3 | Film | 8342296 | 1.982383 |
| 9 | Other-disambiguation | 6194713 | 1.472052 |
| 8 | Music | 2968398 | 0.705382 |
| 7 | Literature | 725751 | 0.172461 |
| 4 | Geography | 427921 | 0.101687 |
| 15 | Sports | 349433 | 0.083036 |
| 13 | Science | 184898 | 0.043937 |
| 12 | Religion | 144305 | 0.034291 |
# Donut chart of where readers come from (top 10 source categories).
donut_opts = dict(
    names='prev_category',
    values='n',
    title='Top 10 Previous Article Categories',
    hole=0.8,
    labels={'prev_category': 'Previous Category', 'n': 'Number of Clicks'},
)
fig = px.pie(cat_prev, **donut_opts)
fig.update_traces(textinfo='percent+label')
# Display the plot
fig.show()
# Mirror of the source-category aggregation, now for destination categories:
# total clicks per curr_category plus each category's percentage share.
cat_curr = df_2.groupby('curr_category', as_index=False)['n'].sum()
cat_curr['pc_num'] = cat_curr['n'] / cat_curr['n'].sum() * 100.0
cat_curr = cat_curr.sort_values(by='n', ascending=False).head(10)
print("\nTop 10 Current")
cat_curr
Top 10 Current
| curr_category | n | pc_num | |
|---|---|---|---|
| 3 | Film | 181834483 | 43.209409 |
| 14 | Search | 132162399 | 31.405809 |
| 8 | Music | 43696578 | 10.383637 |
| 9 | Other-disambiguation | 16787996 | 3.989339 |
| 7 | Literature | 12922970 | 3.070891 |
| 15 | Sports | 7686753 | 1.826607 |
| 4 | Geography | 6484181 | 1.540839 |
| 13 | Science | 4354420 | 1.034743 |
| 0 | Business | 4144505 | 0.984861 |
| 12 | Religion | 3302592 | 0.784796 |
# Donut chart of where readers land (top 10 destination categories).
fig = px.pie(cat_curr, names='curr_category', values='n',
             title='Top 10 Current Article Categories',
             hole=0.8,
             # BUG FIX: the label key must match the plotted column. The
             # original used 'prev_category' (copy-paste from the previous
             # chart), so the 'Current Category' label was silently ignored.
             labels={'curr_category': 'Current Category', 'n': 'Number of Clicks'})
fig.update_traces(textinfo='percent+label')
fig.show()
# Long-form click matrix over (source category, destination category) pairs.
transition = (
    df_2.groupby(['prev_category', 'curr_category'], as_index=False)['n']
    .sum()
)
# Share of all category-to-category clicks, computed before trimming.
transition['pc_num'] = transition['n'] / transition['n'].sum() * 100.0
# Keep the ten heaviest flows and render them as "A -> B" labels.
transition = transition.sort_values(by='n', ascending=False).head(10)
transition['transition'] = (
    transition['prev_category'] + ' -> ' + transition['curr_category']
)
transition = transition[['transition', 'n', 'pc_num']]
print("\nTop 10 Previous x Current Articles:")
transition
Top 10 Previous x Current Articles:
| transition | n | pc_num | |
|---|---|---|---|
| 212 | Search -> Film | 132942169 | 31.591106 |
| 162 | Other-external -> Search | 129516908 | 30.777160 |
| 151 | Other-external -> Film | 41052990 | 9.755440 |
| 217 | Search -> Music | 31238452 | 7.423207 |
| 156 | Other-external -> Music | 9227698 | 2.192782 |
| 216 | Search -> Literature | 8549996 | 2.031739 |
| 218 | Search -> Other-disambiguation | 7214139 | 1.714299 |
| 47 | Film -> Film | 6586954 | 1.565261 |
| 224 | Search -> Sports | 5747166 | 1.365702 |
| 140 | Other-disambiguation -> Other-disambiguation | 4843084 | 1.150864 |
# Donut chart of the ten heaviest category-to-category flows, with the
# legend laid out horizontally beneath the chart.
fig = px.pie(
    transition,
    names='transition',
    values='n',
    title='Top 10 transitions',
    hole=0.8,
    labels={'transition': 'Transition', 'n': 'Number of Clicks'},
)
fig.update_traces(textinfo='percent+label')
horizontal_legend = dict(
    orientation="h",
    yanchor="bottom",
    y=-0.2,
    xanchor="right",
    x=0.02,
)
fig.update_layout(legend=horizontal_legend)
fig.show()
# Top 10 Prev x Curr Transitions as a bar chart.
# BUG FIX: the original assigned the figure back to the name `transition`,
# clobbering the aggregated DataFrame built above — the figure now gets its
# own name so the data survives for any later use.
fig_transition = px.bar(transition, x='transition', y='n',
                        title='Top 10 Prev x Curr Transitions by Total Clicks',
                        labels={'transition': 'Transition (Prev -> Curr)', 'n': 'Number of Clicks'},
                        text='pc_num')
fig_transition.update_traces(texttemplate='%{text:.2f}%', textposition='outside')
fig_transition.update_layout(yaxis_title='Number of Clicks', xaxis_title='Transition (Prev -> Curr)')
fig_transition.show()
df_2.columns
Index(['prev', 'curr', 'type', 'n', 'yearmonth', 'prev_category',
'curr_category'],
dtype='object')
# One-hot encode 'prev_category' into indicator columns (prev_<Category>).
df_2_onehot = pd.get_dummies(df_2, columns=['prev_category'], prefix='prev')
# Encode the target variable (curr_category) as integer class labels.
le_curr_cat = LabelEncoder()
# FIX: plain column assignment on df_2 (a filtered slice of df) raised
# SettingWithCopyWarning and could silently write to a copy; .loc is the
# assignment form the warning itself recommends.
df_2.loc[:, 'curr_cat_encoded'] = le_curr_cat.fit_transform(df_2['curr_category'])
/var/folders/kv/k6mf32494kq6g6stjg6pqb2c0000gn/T/ipykernel_61431/4170714293.py:6: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
# Feature matrix: only the one-hot source-category indicator columns.
prev_feature_cols = [
    'prev_Business',
    'prev_Education',
    'prev_Entertainment',
    'prev_Film',
    'prev_Geography',
    'prev_Health',
    'prev_History',
    'prev_Literature',
    'prev_Music',
    'prev_Other-disambiguation',
    'prev_Other-external',
    'prev_Politics',
    'prev_Religion',
    'prev_Science',
    'prev_Search',
    'prev_Sports',
    'prev_Technology',
]
X = df_2_onehot[prev_feature_cols]
# Target: the integer-encoded destination category.
y = df_2['curr_cat_encoded']
# Normalize the click count n to [0, 1] for use as per-row sample weights.
scaler = MinMaxScaler()
# FIX: .loc assignment avoids the SettingWithCopyWarning that the original
# chained assignment on the filtered frame produced.
df_2.loc[:, 'n_normalized'] = scaler.fit_transform(df_2[['n']])
sample_weights = df_2['n_normalized']
/var/folders/kv/k6mf32494kq6g6stjg6pqb2c0000gn/T/ipykernel_61431/447046820.py:3: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
# 70/30 split; passing sample_weights through train_test_split keeps the
# per-row weights aligned with their feature/target rows.
X_train, X_test, y_train, y_test, sw_train, sw_test = train_test_split(
X, y, sample_weights, test_size=0.3, random_state=42)
# Multinomial logistic regression; lbfgs supports the multinomial loss.
# NOTE(review): `multi_class` is deprecated in recent scikit-learn releases —
# confirm the installed version before upgrading.
logreg = LogisticRegression(max_iter=1000, multi_class='multinomial', solver='lbfgs')
# Train the model with sample weights so high-traffic rows count more
logreg.fit(X_train, y_train, sample_weight=sw_train)
LogisticRegression(max_iter=1000, multi_class='multinomial')
# Make predictions
y_pred = logreg.predict(X_test)
# Evaluate: accuracy weighted by each row's normalized click volume.
accuracy = accuracy_score(y_test, y_pred, sample_weight=sw_test)
# BUG FIX: accuracy_score returns a fraction in [0, 1]; the original printed
# the raw fraction with a '%' suffix (e.g. "0.5709...%"). Format it as a
# real percentage instead.
print(f"Logistic Regression Accuracy (Weighted by n): {accuracy:.2%}")
Logistic Regression Accuracy (Weighted by n): 0.570957735280439%
# Every encoded class id, sorted, so the report lists all categories even
# when a class never appears among the test predictions.
all_labels = sorted(df_2['curr_cat_encoded'].unique())
# Map ids back to human-readable category names for the report rows.
label_names = le_curr_cat.inverse_transform(all_labels)
print("Classification Report (Weighted):")
print(classification_report(
    y_test,
    y_pred,
    labels=all_labels,
    target_names=label_names,
    sample_weight=sw_test,
))
logreg.classes_
Classification Report (Weighted):
precision recall f1-score support
Business 0.00 0.00 0.00 0.01028842126389789
Education 0.00 0.00 0.00 0.0025603536085672244
Entertainment 0.00 0.00 0.00 0.004191303754626686
Film 0.67 0.81 0.74 0.49351773267830856
Geography 0.00 0.00 0.00 0.013208342960775575
Health 0.00 0.00 0.00 0.0011476876722362837
History 0.00 0.00 0.00 0.0013017130273135769
Literature 0.00 0.00 0.00 0.032661910748689175
Music 0.00 0.00 0.00 0.10414401378106876
Other-disambiguation 0.00 0.00 0.00 0.033396677457521624
Other-external 0.00 0.00 0.00 8.13215683302338e-06
Politics 0.00 0.00 0.00 0.0012169344692365119
Religion 0.00 0.00 0.00 0.008547407149041654
Science 0.00 0.00 0.00 0.011695490169210156
Search 0.28 0.98 0.43 0.058911039669582485
Sports 0.00 0.00 0.00 0.019378962656806678
Technology 0.00 0.00 0.00 0.007548896815309116
accuracy 0.57 0.803725020039025
macro avg 0.06 0.11 0.07 0.803725020039025
weighted avg 0.43 0.57 0.48 0.803725020039025
/Users/devpal/opt/anaconda3/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1318: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. /Users/devpal/opt/anaconda3/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1318: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. /Users/devpal/opt/anaconda3/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1318: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
array([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16])
# Pull the coefficient row corresponding to the 'Film' class.
film_idx = le_curr_cat.transform(['Film'])[0]
coeff = logreg.coef_[film_idx]
# Print each prev-category feature next to its learned weight.
for feature_name, weight in zip(X.columns, coeff):
    print(f"{feature_name}: {weight}")
prev_Business: 5.35245881597101e-05 prev_Education: -1.6234997999897195e-05 prev_Entertainment: -6.763278224910709e-05 prev_Film: 0.01733543401904624 prev_Geography: -0.000905152951849518 prev_Health: -6.702704455477824e-06 prev_History: -2.4294485850290654e-05 prev_Literature: 0.0009160080945909427 prev_Music: -0.005217760029471664 prev_Other-disambiguation: -0.010313226213083494 prev_Other-external: -0.18064113141953117 prev_Politics: -3.350905614849994e-05 prev_Religion: -0.00027057936328343075 prev_Science: -0.00036879936427390163 prev_Search: 0.18018090167703135 prev_Sports: -0.000570535755521401 prev_Technology: -6.345910681447646e-05
Google / Bing Search: 0.1801 — This strong positive coefficient shows that if the previous article is categorized as 'Search' (search engines or navigation pages), the next article is substantially more likely to be in the 'Film' category.
Business: 5.352e-05 — This is a very small positive coefficient, meaning that if the previous article is in the 'Business' category, it has almost no effect on the likelihood that the next article will be in 'Film'.
Film: 0.0173 — This positive coefficient indicates that if the previous article is in the 'Film' category, it slightly increases the likelihood that the next article will also be in 'Film'.
Other-external: -0.1806 — This negative coefficient is significant, meaning if the previous article is in 'Other-external' (likely external or miscellaneous content), it greatly decreases the likelihood that the next article will be in 'Film'.
Film
We've also predicted 'Search' well, at 99% recall (with almost no false negatives, but a high number of false positives).
Two things seem interesting to attempt at this point: